Analyze Station Locations in Vancouver

In [6]:
import requests
import pandas as pd
import numpy as np
import zipfile

#creat map
import folium
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
In [7]:
# the GTFS zipfile is loaded into GTFS: a dictionary of dataframes
gtfs_zip = zipfile.ZipFile("ProjectData/google_transit.zip")
GTFS = {}
for table in gtfs_zip.namelist():
    table_name = table.split(".")[0]
    GTFS[table_name] = pd.read_csv(gtfs_zip.open(table))
print(GTFS.keys())
dict_keys(['agency', 'calendar', 'calendar_dates', 'feed_info', 'routes', 'shapes', 'stops', 'stop_times', 'transfers', 'trips', 'cardinal_directions_exceptions', 'direction_names_exceptions', 'stop_order_exceptions'])
In [8]:
# 8916 stops
# each skytrain platform has a parent_station
print(GTFS["stops"].loc[GTFS["stops"]["parent_station"]==99917])
print("---------------------------------------")
print(GTFS["stops"].loc[GTFS["stops"]["stop_id"]==99917])
      stop_id  stop_code                               stop_name  stop_desc  \
7854     8044    57969.0  Commercial-Broadway Station Platform 4        NaN   
7886     8073    57998.0  Commercial-Broadway Station Platform 3        NaN   
8183     8754    60822.0  Commercial-Broadway Station Platform 1        NaN   
8192     8763    60823.0  Commercial-Broadway Station Platform 2        NaN   

       stop_lat    stop_lon zone_id  stop_url  location_type  parent_station  
7854  49.262317 -123.069179    ZN 1       NaN              0         99917.0  
7886  49.262311 -123.069077    ZN 1       NaN              0         99917.0  
8183  49.262980 -123.068426    ZN 1       NaN              0         99917.0  
8192  49.262912 -123.068491    ZN 1       NaN              0         99917.0  
---------------------------------------
      stop_id  stop_code                    stop_name  stop_desc  stop_lat  \
8874    99917        NaN  Commercial-Broadway Station        NaN  49.26267   

        stop_lon zone_id  stop_url  location_type  parent_station  
8874 -123.068765    ZN 1       NaN              1             NaN  
In [9]:
# we are going to generate two station lists
# 1. existing rapid transit stations (parent_station): skytrain, seabus and west cost express
# 2. canadidate stations: exiting bus stops

RapidTransitStop_list = GTFS["stops"]["parent_station"].dropna().unique().tolist()
print("%d rapid transit stations"%len(RapidTransitStop_list))
RapidTransitStop_df = GTFS["stops"].loc[GTFS["stops"]["stop_id"].isin(RapidTransitStop_list)]
print(RapidTransitStop_df.head())
print(RapidTransitStop_df.shape)

#bus stops are rows where parent_station is NaN and stop_id not in the RapidTransitStop_list
BusStop_filter = ~GTFS["stops"]["stop_id"].isin(RapidTransitStop_list)
BusStop_filter = BusStop_filter & (GTFS["stops"]["parent_station"].isnull())
BusStop_df = GTFS["stops"].loc[BusStop_filter]
print("%d bus stops"%len(BusStop_df))
print(BusStop_df.head())
print(BusStop_df.shape)
59 rapid transit stations
      stop_id  stop_code                   stop_name  stop_desc   stop_lat  \
1466    12034        NaN          Waterfront Station        NaN  49.285687   
8858    99901        NaN         YVR-Airport Station        NaN  49.194174   
8859    99902        NaN   Sea Island Centre Station        NaN  49.192986   
8860    99903        NaN           Templeton Station        NaN  49.196688   
8861    99904        NaN  Richmond-Brighouse Station        NaN  49.167943   

        stop_lon zone_id  stop_url  location_type  parent_station  
1466 -123.111773    ZN 1       NaN              1             NaN  
8858 -123.178269    ZN 2       NaN              1             NaN  
8859 -123.157887    ZN 2       NaN              1             NaN  
8860 -123.146337    ZN 2       NaN              1             NaN  
8861 -123.136372    ZN 2       NaN              1             NaN  
(59, 10)
8726 bus stops
   stop_id  stop_code                          stop_name  stop_desc  \
0    10000    59326.0   Northbound No. 5 Rd @ McNeely Dr        NaN   
1    10001    59324.0  Northbound No. 5 Rd @ Woodhead Rd        NaN   
2    10002    59323.0    Southbound No. 5 Rd @ Cambie Rd        NaN   
3    10003    59325.0  Southbound No. 5 Rd @ Woodhead Rd        NaN   
4    10004    59327.0    Eastbound McNeely Dr @ No. 5 Rd        NaN   

    stop_lat    stop_lon zone_id  stop_url  location_type  parent_station  
0  49.179962 -123.091490  BUS ZN       NaN              0             NaN  
1  49.182670 -123.091448  BUS ZN       NaN              0             NaN  
2  49.184252 -123.091627  BUS ZN       NaN              0             NaN  
3  49.182051 -123.091659  BUS ZN       NaN              0             NaN  
4  49.179586 -123.091105  BUS ZN       NaN              0             NaN  
(8726, 10)
In [10]:
# show rapid transit station locations on map

# map centre lat,long
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
address = 'New Westminster, BC'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

# create map of rapid transit stations using latitude and longitude values
Station_Map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add bus stop markers to map
for lat, lng, stopid, stopname in zip(BusStop_df['stop_lat'], BusStop_df['stop_lon'], BusStop_df['stop_id'], BusStop_df['stop_name']):
    label = '{}, {}'.format(stopid, stopname)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=1,
        popup=label,
        color='#9b99c4',
        fill=True,
        fill_color='#9b99c4',
        fill_opacity=0.1,
        parse_html=False).add_to(Station_Map)  

# add rapid transit markers to map
for lat, lng, stopid, stopname in zip(RapidTransitStop_df['stop_lat'], RapidTransitStop_df['stop_lon'], RapidTransitStop_df['stop_id'], RapidTransitStop_df['stop_name']):
    label = '{}, {}'.format(stopid, stopname)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.9,
        parse_html=False).add_to(Station_Map)  
    
Station_Map
Out[10]:
In [ ]:
## Foursquare API - Venue Data

#get nearby venue - utility function
CLIENT_ID = 'RBSGZRHE5H03FMTM43053AYPDNC2TJNJO0W50FMFWNVJSHQT' # your Foursquare ID
CLIENT_SECRET = 'FIDHWOGNIKYG1KYWVAGIIH3B2WRO32Y4OLQOZVOUOUBZK2NG' # your Foursquare Secret
VERSION = '20171101' # Foursquare API version
radius = 400 #m
LIMIT = 100 #no more than 100 venues per neighbourhood

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        try:
            response = requests.get(url).json()["response"]
            results = response['groups'][0]['items']
        except:
            continue # to the next record, no venue around
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['id'],
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['stop_id', 
                  'stop_lat', 
                  'stop_lon', 
                  'VenueID',
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)
In [ ]:
rapidtransit_venues = getNearbyVenues(names=RapidTransitStop_df['stop_id'],
                                     latitudes=RapidTransitStop_df['stop_lat'],
                                     longitudes=RapidTransitStop_df['stop_lon']
                                  )
rapidtransit_venues.to_csv("ProjectData/API_rapidtransit_venues.csv",index=False)
In [ ]:
# input - df: a Dataframe, chunkSize: the chunk size
# output - a list of DataFrame
# purpose - splits the DataFrame into smaller of max size chunkSize (last is smaller)
def splitDataFrameIntoSmaller(df, chunkSize = 100): 
    listOfDf = list()
    numberChunks = len(df) // chunkSize + 1
    for i in range(numberChunks):
        listOfDf.append(df[i*chunkSize:(i+1)*chunkSize])
    return listOfDf
In [ ]:
#split the bus stop dataframe into smaller chunks to query Foursquare API data 
BusStop_df_sections = splitDataFrameIntoSmaller(BusStop_df)
print("%d sections of (max) 100 stops"%len(BusStop_df_sections))
In [ ]:
# range 1 to 88 
for section in range(-1):
    rapidtransit_venues = getNearbyVenues(names=BusStop_df_sections[section]['stop_id'],
                                         latitudes=BusStop_df_sections[section]['stop_lat'],
                                         longitudes=BusStop_df_sections[section]['stop_lon']
                                      )
    rapidtransit_venues.to_csv("ProjectData/API_bus_venues_section%d.csv"%(section+1),index=False)
In [ ]:
# check Foursquare API limit
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    40.794123, 
    -73.953961, 
    500, 
    1)
requests.get(url).json()
# note: code 429 means you are over the daily call quota limit
# reset midnight UTC (5pm Vancouver Time)
In [11]:
# load all the venues from API calls
import os
directory = os.fsencode("ProjectData")
venue_df_list = []
for file in os.listdir(directory):
     filename = os.fsdecode(file)
     if "API" in filename and filename.endswith(".csv"): 
        venue_df_list.append(pd.read_csv("ProjectData/"+filename))
venue_df = pd.concat(venue_df_list, axis=0)

# check the venue categories
print('{} stops in the venue dataframe.'.format(len(venue_df['stop_id'].unique())))
print('There are {} uniques categories.'.format(len(venue_df['Venue Category'].unique())))
print('There are {} uniques venues.'.format(len(venue_df['VenueID'].unique())))
print(venue_df.head())
8340 stops in the venue dataframe.
There are 451 uniques categories.
There are 10861 uniques venues.
   stop_id   stop_lat    stop_lon                   VenueID  \
0     2660  49.209736 -122.994325  4bae7801f964a5203db63be3   
1     2660  49.209736 -122.994325  551c1d13498e6b30fa879f25   
2     2660  49.209736 -122.994325  4bd8948011dcc9280feef633   
3     2660  49.209736 -122.994325  4bf997345ec320a1bf848ad3   
4     2660  49.209736 -122.994325  4ba545dff964a5208df438e3   

                                  Venue  Venue Latitude  Venue Longitude  \
0                Gardenworks Mandeville       49.206704      -122.998270   
1  Riverway Golf Course & Driving Range       49.207437      -122.996536   
2                     Sun Tai Sang Farm       49.206914      -122.994924   
3                     Wing Wong Nursery       49.206530      -122.993784   
4                          Garden Works       49.206706      -122.998498   

   Venue Category  
0   Garden Center  
1     Golf Course  
2  Farmers Market  
3   Garden Center  
4   Garden Center  
In [12]:
# check the venues around a station (specify stop_id) on a map
stopid = 99930

latitude = GTFS["stops"].loc[GTFS["stops"]["stop_id"]==stopid]["stop_lat"].tolist()[0]
longitude = GTFS["stops"].loc[GTFS["stops"]["stop_id"]==stopid]["stop_lon"].tolist()[0]
stopname = GTFS["stops"].loc[GTFS["stops"]["stop_id"]==stopid]["stop_name"].tolist()[0]

station_venues = venue_df.loc[venue_df["stop_id"]==stopid]
print("%d venue(s) near %s"%(len(station_venues),stopname))

# create map of rapid transit stations using latitude and longitude values
StationVenue_Map = folium.Map(location=[latitude, longitude], tiles='cartodbpositron', zoom_start=16)

# add venue markers to map
for lat, lng, venuename, venuetype in zip(station_venues['Venue Latitude'], station_venues['Venue Longitude'], station_venues['Venue'], station_venues['Venue Category']):
    label = '{}, {}'.format(venuename, venuetype)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='#d17389',
        fill=True,
        fill_color='#d17389',
        fill_opacity=0.9,
        parse_html=False).add_to(StationVenue_Map)  
    
# add the stop marker to map
label = '{}, {}'.format(stopid, stopname)
label = folium.Popup(label, parse_html=True)
folium.CircleMarker(
        [latitude, longitude],
        radius=10,
        popup=label,
        color='#8a1631',
        fill=True,
        fill_color='#8a1631',
        fill_opacity=0.9,
        parse_html=False).add_to(StationVenue_Map)  

StationVenue_Map
21 venue(s) near Coquitlam Central Station
Out[12]:
In [ ]:
#print(venue_df['Venue Category'].unique())
venue_type = venue_df[['Venue Category']].drop_duplicates()
venue_type["Purpose"] = "Other"
venue_type_lookup = [["Restaurant","Restaurant"], #search keyword, type label 
                     ["Bar","Restaurant"],
                     ["Pizza","Restaurant"],
                     ["Sandwich","Restaurant"],
                     ["Cafe","Restaurant"],
                     ["Café","Restaurant"],
                     ["Diner","Restaurant"],
                     ["Burger","Restaurant"],
                     ["Deli","Restaurant"],
                     ["Breakfast","Restaurant"],
                     ["Food","Restaurant"],
                     ["Joint","Restaurant"],
                     ["Salad","Restaurant"],
                     ["Snack","Restaurant"],
                     ["Taco","Restaurant"],
                     ["Soup","Restaurant"],
                     ["Fried Chicken","Restaurant"],
                     ["Store","Store"],
                     ["Shop","Store"],
                     ["Supermarket","Store"],
                     ["Gastropub","Store"],
                     ["Trail","Park"],
                     ["Public Art","Park"],
                     ["Park","Park"],
                     ["Bank","Bank"],
                     ["Chiropractor","HealthCare"],
                     ["Physical Therapist","HealthCare"],
                     ["Pharmacy","HealthCare"],
                     ["Message","HealthCare"],
                     ["Dispensary","HealthCare"],
                     ["Hotel","Hotel"],
                     ["Motel","Hotel"],
                     ["Inn","Hotel"],
                     ["School","School"],
                     ["Office","Office"],
                     ["Tech Startup","Office"],
                     ["Lawyer","Office"],
                     ["Gym","Recreational"],
                     ["Yoga","Recreational"],
                     ["Dance","Recreational"],
                     ["Golf","Recreational"],
                     ["Laser Tag","Recreational"],
                     ["Entertainment","Recreational"],
                     ["Rock Club","Recreational"],
                     ["Photography Studio","Recreational"],
                     ["Stadium","Recreational"],
                     ["Bowling Alley","Recreational"],
                     ["Club","Recreational"],
                     ["Bus","Transportation"],
                     ["Station","Transportation"],
                     ["Airport","Transportation"],
                     ["Travel Lounge","Transportation"],
                     ["Heliport","Transportation"],
                     ["Transportation","Transportation"]]
for keyword, label in venue_type_lookup:
    venue_type["Purpose"] = np.where(venue_type["Venue Category"].str.contains(keyword),label,venue_type["Purpose"])
venue_type[venue_type["Purpose"]=="Other"]["Venue Category"].tolist()
In [ ]:
venue_type_count = venue_df[['VenueID', 'Venue Category']].drop_duplicates()
venue_type_count = venue_type_count.groupby(["Venue Category"]).count().reset_index().sort_values(by=["VenueID"],ascending=False)
venue_type_count = pd.merge(venue_type_count,venue_type,on="Venue Category")
print(venue_type_count.head(30))
In [ ]:
# show all venue locations on a map

# map centre lat,long
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
address = 'New Westminster, BC'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

station_venues = venue_df[['VenueID','Venue Latitude','Venue Longitude','Venue','Venue Category']].drop_duplicates()

# create map of rapid transit stations using latitude and longitude values
StationVenue_Map = folium.Map(location=[latitude, longitude], tiles='cartodbpositron', zoom_start=11)

# add venue markers to map
for lat, lng, venuename, venuetype in zip(station_venues['Venue Latitude'], station_venues['Venue Longitude'], station_venues['Venue'], station_venues['Venue Category']):
    label = '{}, {}'.format(venuename, venuetype)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=1,
        popup=label,
        color='#d17389',
        fill=True,
        fill_color='#d17389',
        fill_opacity=0.2,
        parse_html=False).add_to(StationVenue_Map)  

print("All venues")
StationVenue_Map

Run K-mean clustering on Rapid Transit Stops

In [13]:
venue_df = pd.read_csv("ProjectData/API_rapidtransit_venues.csv")

# check the venue categories
print('{} stops in the venue dataframe.'.format(len(venue_df['stop_id'].unique())))
print('There are {} uniques categories.'.format(len(venue_df['Venue Category'].unique())))
print('There are {} uniques venues.'.format(len(venue_df['VenueID'].unique())))
print(venue_df.head())
59 stops in the venue dataframe.
There are 236 uniques categories.
There are 1523 uniques venues.
   stop_id   stop_lat    stop_lon                   VenueID  \
0    12034  49.285687 -123.111773  4aa7f561f964a520384e20e3   
1    12034  49.285687 -123.111773  57d05c9c498e185ed0205e5a   
2    12034  49.285687 -123.111773  55831809498eb7238a4bd305   
3    12034  49.285687 -123.111773  4aa6bc27f964a520df4a20e3   
4    12034  49.285687 -123.111773  4aa74686f964a520944c20e3   

                        Venue  Venue Latitude  Venue Longitude  \
0             Miku Restaurant       49.286713      -123.112044   
1                The Poke Guy       49.283937      -123.112167   
2                  Pholicious       49.284309      -123.112290   
3  Steamworks Brewing Company       49.285107      -123.111447   
4     The Fairmont Waterfront       49.287221      -123.113774   

          Venue Category  
0    Japanese Restaurant  
1             Poke Place  
2  Vietnamese Restaurant  
3                Brewery  
4                  Hotel  
In [14]:
#one hot encoding
stop_onehot = pd.get_dummies(venue_df[['Venue Category']], prefix="", prefix_sep="")
onehot_list = list(stop_onehot.columns)
stop_onehot['stop_id'] = venue_df['stop_id']

#make sure Neighborhood is the first column
fix_columns = ['stop_id'] + onehot_list
stop_onehot = stop_onehot[fix_columns]

stop_onehot.head()
Out[14]:
stop_id Accessories Store Afghan Restaurant Airport Airport Gate Airport Lounge Airport Service Airport Terminal American Restaurant Arcade ... Train Station Travel Lounge Vegetarian / Vegan Restaurant Video Game Store Vietnamese Restaurant Warehouse Store Wine Bar Wine Shop Women's Store Yoga Studio
0 12034 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 12034 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 12034 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
3 12034 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 12034 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 237 columns

In [15]:
#for each stop, get the mean of the frequency of occurance of each category
stop_group = stop_onehot.groupby('stop_id').mean().reset_index()
stop_group.head()
Out[15]:
stop_id Accessories Store Afghan Restaurant Airport Airport Gate Airport Lounge Airport Service Airport Terminal American Restaurant Arcade ... Train Station Travel Lounge Vegetarian / Vegan Restaurant Video Game Store Vietnamese Restaurant Warehouse Store Wine Bar Wine Shop Women's Store Yoga Studio
0 12034 0.010000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.010000 0.0 ... 0.01 0.000000 0.020000 0.0 0.020000 0.0 0.000000 0.0 0.000000 0.01
1 99901 0.000000 0.000000 0.020408 0.020408 0.081633 0.122449 0.020408 0.000000 0.0 ... 0.00 0.020408 0.000000 0.0 0.020408 0.0 0.020408 0.0 0.000000 0.00
2 99902 0.000000 0.000000 0.000000 0.000000 0.000000 0.200000 0.200000 0.000000 0.0 ... 0.20 0.000000 0.000000 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.00
3 99903 0.035714 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 ... 0.00 0.000000 0.000000 0.0 0.000000 0.0 0.000000 0.0 0.035714 0.00
4 99904 0.000000 0.010989 0.000000 0.000000 0.000000 0.000000 0.000000 0.021978 0.0 ... 0.00 0.000000 0.010989 0.0 0.032967 0.0 0.000000 0.0 0.000000 0.00

5 rows × 237 columns

In [16]:
#utility function for top x venues
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]
In [17]:
import numpy as np
#print the top 10 most common venues near each stop

num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['stop_id']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['stop_id'] = stop_group['stop_id']

for ind in np.arange(stop_group.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(stop_group.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()
Out[17]:
stop_id 1st Most Common Venue 2nd Most Common Venue 3rd Most Common Venue 4th Most Common Venue 5th Most Common Venue 6th Most Common Venue 7th Most Common Venue 8th Most Common Venue 9th Most Common Venue 10th Most Common Venue
0 12034 Coffee Shop Café Boat or Ferry Hotel Hotel Bar Restaurant Furniture / Home Store Food Truck Bar Sandwich Place
1 99901 Coffee Shop Airport Service Rental Car Location Airport Lounge Sandwich Place Duty-free Shop Gift Shop Bank Pub Public Art
2 99902 Deli / Bodega Airport Service Airport Terminal Train Station Plane Flower Shop Flea Market Food Fish Market Electronics Store
3 99903 Clothing Store Shoe Store Accessories Store Dim Sum Restaurant Café Restaurant Coffee Shop Pizza Place Outlet Mall Taco Place
4 99904 Chinese Restaurant Clothing Store Coffee Shop Japanese Restaurant Sandwich Place Bakery Bank Bubble Tea Shop Fast Food Restaurant Sushi Restaurant
In [18]:
#Cluster Neighbourhoods

# import k-means from clustering stage
from sklearn.cluster import KMeans

group_clustering = stop_group.drop('stop_id', 1)
In [19]:
Sum_of_squared_distances = []
K = range(1,15)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(group_clustering)
    Sum_of_squared_distances.append(km.inertia_)
In [20]:
import matplotlib.pyplot as plt
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
In [21]:
# set number of clusters
kclusters = 2

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(group_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 
Out[21]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0], dtype=int32)
In [22]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

cluster_merge = RapidTransitStop_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
cluster_merge = cluster_merge.join(neighborhoods_venues_sorted.set_index('stop_id'), on='stop_id')

cluster_merge.head() # check the last columns!
Out[22]:
stop_id stop_code stop_name stop_desc stop_lat stop_lon zone_id stop_url location_type parent_station ... 1st Most Common Venue 2nd Most Common Venue 3rd Most Common Venue 4th Most Common Venue 5th Most Common Venue 6th Most Common Venue 7th Most Common Venue 8th Most Common Venue 9th Most Common Venue 10th Most Common Venue
1466 12034 NaN Waterfront Station NaN 49.285687 -123.111773 ZN 1 NaN 1 NaN ... Coffee Shop Café Boat or Ferry Hotel Hotel Bar Restaurant Furniture / Home Store Food Truck Bar Sandwich Place
8858 99901 NaN YVR-Airport Station NaN 49.194174 -123.178269 ZN 2 NaN 1 NaN ... Coffee Shop Airport Service Rental Car Location Airport Lounge Sandwich Place Duty-free Shop Gift Shop Bank Pub Public Art
8859 99902 NaN Sea Island Centre Station NaN 49.192986 -123.157887 ZN 2 NaN 1 NaN ... Deli / Bodega Airport Service Airport Terminal Train Station Plane Flower Shop Flea Market Food Fish Market Electronics Store
8860 99903 NaN Templeton Station NaN 49.196688 -123.146337 ZN 2 NaN 1 NaN ... Clothing Store Shoe Store Accessories Store Dim Sum Restaurant Café Restaurant Coffee Shop Pizza Place Outlet Mall Taco Place
8861 99904 NaN Richmond-Brighouse Station NaN 49.167943 -123.136372 ZN 2 NaN 1 NaN ... Chinese Restaurant Clothing Store Coffee Shop Japanese Restaurant Sandwich Place Bakery Bank Bubble Tea Shop Fast Food Restaurant Sushi Restaurant

5 rows × 21 columns

In [23]:
#creat map
import folium
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# map centre lat,long
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
address = 'New Westminster, BC'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude


# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(cluster_merge['stop_lat'], cluster_merge['stop_lon'], cluster_merge['stop_name'], cluster_merge['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters
Out[23]:
In [24]:
cluster_merge.loc[cluster_merge['Cluster Labels'] == 0, cluster_merge.columns[[1] + list(range(5, cluster_merge.shape[1]))]]
Out[24]:
stop_code stop_lon zone_id stop_url location_type parent_station Cluster Labels 1st Most Common Venue 2nd Most Common Venue 3rd Most Common Venue 4th Most Common Venue 5th Most Common Venue 6th Most Common Venue 7th Most Common Venue 8th Most Common Venue 9th Most Common Venue 10th Most Common Venue
8866 NaN -123.116523 ZN 1 NaN 1 NaN 0 Park Bus Stop Track Light Rail Station Golf Course Yoga Studio Ethiopian Restaurant Flea Market Fish Market Fish & Chips Shop
8880 NaN -122.963998 ZN 2 NaN 1 NaN 0 Metro Station Bus Station Ethiopian Restaurant Food Flower Shop Flea Market Fish Market Fish & Chips Shop Financial or Legal Service Filipino Restaurant
8881 NaN -122.939192 ZN 2 NaN 1 NaN 0 Sporting Goods Shop Park Bus Stop Light Rail Station Yoga Studio Ethiopian Restaurant Flower Shop Flea Market Fish Market Fish & Chips Shop
8894 NaN -123.055908 ZN 1 NaN 1 NaN 0 Bus Station Bus Stop Bike Trail Park Bus Line Metro Station Yoga Studio Farmers Market Food Flower Shop
8895 NaN -123.046113 ZN 1 NaN 1 NaN 0 Bus Stop Park Metro Station Plaza Flower Shop Bus Station Bar Fabric Shop Flea Market Fish Market
8901 NaN -122.948980 ZN 2 NaN 1 NaN 0 Bus Stop Sandwich Place Vietnamese Restaurant Bus Station Light Rail Station Yoga Studio Flea Market Fish Market Fish & Chips Shop Financial or Legal Service
8905 NaN -122.882800 ZN 2 NaN 1 NaN 0 Bus Station Bus Stop Metro Station Storage Facility Laser Tag Fabric Shop Food Flower Shop Flea Market Fish Market
In [25]:
cluster_merge.loc[cluster_merge['Cluster Labels'] == 1, cluster_merge.columns[[1] + list(range(5, cluster_merge.shape[1]))]]
Out[25]:
stop_code stop_lon zone_id stop_url location_type parent_station Cluster Labels 1st Most Common Venue 2nd Most Common Venue 3rd Most Common Venue 4th Most Common Venue 5th Most Common Venue 6th Most Common Venue 7th Most Common Venue 8th Most Common Venue 9th Most Common Venue 10th Most Common Venue
1466 NaN -123.111773 ZN 1 NaN 1 NaN 1 Coffee Shop Café Boat or Ferry Hotel Hotel Bar Restaurant Furniture / Home Store Food Truck Bar Sandwich Place
8858 NaN -123.178269 ZN 2 NaN 1 NaN 1 Coffee Shop Airport Service Rental Car Location Airport Lounge Sandwich Place Duty-free Shop Gift Shop Bank Pub Public Art
8859 NaN -123.157887 ZN 2 NaN 1 NaN 1 Deli / Bodega Airport Service Airport Terminal Train Station Plane Flower Shop Flea Market Food Fish Market Electronics Store
8860 NaN -123.146337 ZN 2 NaN 1 NaN 1 Clothing Store Shoe Store Accessories Store Dim Sum Restaurant Café Restaurant Coffee Shop Pizza Place Outlet Mall Taco Place
8861 NaN -123.136372 ZN 2 NaN 1 NaN 1 Chinese Restaurant Clothing Store Coffee Shop Japanese Restaurant Sandwich Place Bakery Bank Bubble Tea Shop Fast Food Restaurant Sushi Restaurant
8862 NaN -123.136475 ZN 2 NaN 1 NaN 1 Chinese Restaurant Japanese Restaurant Café Coffee Shop Korean Restaurant Grocery Store Bank Fast Food Restaurant Pet Store Furniture / Home Store
8863 NaN -123.136325 ZN 2 NaN 1 NaN 1 Chinese Restaurant Food Court Shopping Mall Supermarket Bubble Tea Shop Karaoke Bar Juice Bar Tea Room Japanese Restaurant Asian Restaurant
8864 NaN -123.126062 ZN 2 NaN 1 NaN 1 Hotel Bus Stop Japanese Restaurant Harbor / Marina Coffee Shop Sandwich Place Theater Athletics & Sports Supermarket Italian Restaurant
8865 NaN -123.117015 ZN 1 NaN 1 NaN 1 Bank Sandwich Place Pharmacy Train Station Breakfast Spot Thai Restaurant Supermarket Movie Theater Fast Food Restaurant Coffee Shop
8867 NaN -123.116295 ZN 1 NaN 1 NaN 1 Electronics Store Coffee Shop Accessories Store Tea Room Toy / Game Store Fast Food Restaurant Sushi Restaurant Optical Shop Cantonese Restaurant Bank
8868 NaN -123.115450 ZN 1 NaN 1 NaN 1 Chinese Restaurant Bus Stop Coffee Shop Light Rail Station Juice Bar Dessert Shop Bubble Tea Shop Park Financial or Legal Service Filipino Restaurant
8869 NaN -123.114822 ZN 1 NaN 1 NaN 1 Coffee Shop Chinese Restaurant Japanese Restaurant Fast Food Restaurant Pizza Place Sandwich Place Ramen Restaurant Gym Café Liquor Store
8870 NaN -123.115348 ZN 1 NaN 1 NaN 1 Coffee Shop Japanese Restaurant Sushi Restaurant Café Art Gallery Grocery Store Ramen Restaurant Sporting Goods Shop Furniture / Home Store Frozen Yogurt Shop
8871 NaN -123.122079 ZN 1 NaN 1 NaN 1 Hotel Italian Restaurant Café Mexican Restaurant French Restaurant Sushi Restaurant Boat or Ferry Seafood Restaurant Ice Cream Shop Yoga Studio
8872 NaN -123.118227 ZN 1 NaN 1 NaN 1 Hotel Food Truck Coffee Shop Seafood Restaurant Bar Steakhouse New American Restaurant Dessert Shop Restaurant Bookstore
8873 NaN -123.079069 ZN 1 NaN 1 NaN 1 Coffee Shop Furniture / Home Store Gas Station Hobby Shop Ice Cream Shop Monument / Landmark Restaurant Park Hardware Store Yoga Studio
8874 NaN -123.068765 ZN 1 NaN 1 NaN 1 Coffee Shop Sushi Restaurant Pizza Place Chinese Restaurant Vegetarian / Vegan Restaurant Sandwich Place Grocery Store Ethiopian Restaurant Music Venue Bowling Alley
8875 NaN -123.045379 ZN 1 NaN 1 NaN 1 Coffee Shop Restaurant Italian Restaurant Pizza Place Sandwich Place Chinese Restaurant Office Japanese Restaurant Arts & Crafts Store Mexican Restaurant
8876 NaN -123.032867 ZN 1 NaN 1 NaN 1 Grocery Store Pizza Place Coffee Shop Burger Joint Mobile Phone Shop Bus Stop Sporting Goods Shop Big Box Store Liquor Store Sushi Restaurant
8877 NaN -123.013554 ZN 2 NaN 1 NaN 1 American Restaurant Coffee Shop Deli / Bodega Supermarket Sporting Goods Shop Gym Gym / Fitness Center Japanese Restaurant Fried Chicken Joint Paper / Office Supplies Store
8878 NaN -123.001829 ZN 2 NaN 1 NaN 1 Coffee Shop Pharmacy Sandwich Place Pizza Place Electronics Store Sushi Restaurant Gastropub Sporting Goods Shop Shopping Mall Shoe Store
8879 NaN -122.982169 ZN 2 NaN 1 NaN 1 Bus Stop Sushi Restaurant Bowling Alley Asian Restaurant Breakfast Spot Sandwich Place Light Rail Station Chinese Restaurant Chiropractor Rental Service
8882 NaN -122.918223 ZN 2 NaN 1 NaN 1 Convenience Store Warehouse Store Trail Burger Joint Sandwich Place Brewery Fish Market Fish & Chips Shop Electronics Store Filipino Restaurant
8883 NaN -122.896805 ZN 2 NaN 1 NaN 1 Korean Restaurant Fast Food Restaurant Tea Room Burger Joint Sushi Restaurant Coffee Shop Sporting Goods Shop Big Box Store Bank Chocolate Shop
8884 NaN -122.889834 ZN 3 NaN 1 NaN 1 Fast Food Restaurant Coffee Shop Pizza Place Discount Store Gas Station Korean Restaurant Thrift / Vintage Store Grocery Store Bus Stop Sandwich Place
8885 NaN -122.846034 ZN 3 NaN 1 NaN 1 Brewery Convenience Store Dessert Shop Bubble Tea Shop Seafood Restaurant Sandwich Place Salon / Barbershop Pizza Place Park Sushi Restaurant
8886 NaN -122.828187 ZN 3 NaN 1 NaN 1 Coffee Shop Frozen Yogurt Shop Bank Bar Grocery Store Japanese Restaurant Yoga Studio Burger Joint Cosmetics Shop Pet Store
8887 NaN -122.800421 ZN 3 NaN 1 NaN 1 Vietnamese Restaurant Coffee Shop Clothing Store Bookstore Electronics Store Bank BBQ Joint Burger Joint Auto Dealership Pub
8888 NaN -122.793917 ZN 3 NaN 1 NaN 1 Coffee Shop Electronics Store Fast Food Restaurant Clothing Store Sushi Restaurant Pharmacy Bubble Tea Shop Food Court Shopping Mall Dessert Shop
8889 NaN -122.791541 ZN 3 NaN 1 NaN 1 Coffee Shop Korean Restaurant Yoga Studio Japanese Restaurant Pool Park Burger Joint Metro Station Lake Gym / Fitness Center
8890 NaN -123.119557 ZN 1 NaN 1 NaN 1 Hotel Dessert Shop Coffee Shop Food Truck Clothing Store Seafood Restaurant Café Italian Restaurant Bar Sandwich Place
8891 NaN -123.115721 ZN 1 NaN 1 NaN 1 Hotel Coffee Shop Sandwich Place New American Restaurant Restaurant Taco Place Steakhouse Bar Arts & Crafts Store Café
8892 NaN -123.109181 ZN 1 NaN 1 NaN 1 Coffee Shop Hockey Arena Pizza Place Plaza Sandwich Place Pub Sushi Restaurant Bakery Circus Stadium
8893 NaN -123.100392 ZN 1 NaN 1 NaN 1 Bar Ice Cream Shop Park Italian Restaurant Sculpture Garden Science Museum Salad Place Movie Theater Circus Bakery
8896 NaN -123.031811 ZN 1 NaN 1 NaN 1 Bakery Asian Restaurant Pizza Place Coffee Shop Sushi Restaurant Café Sandwich Place Metro Station Park Distribution Center
8897 NaN -123.012670 ZN 2 NaN 1 NaN 1 Gym / Fitness Center Indian Restaurant Sandwich Place Tennis Court Greek Restaurant Japanese Restaurant Bank Portuguese Restaurant Pizza Place Mobile Phone Shop
8898 NaN -123.003920 ZN 2 NaN 1 NaN 1 Bakery Toy / Game Store Electronics Store Dessert Shop Cosmetics Shop Coffee Shop Bookstore Hotel Furniture / Home Store Asian Restaurant
8899 NaN -122.988480 ZN 2 NaN 1 NaN 1 Ramen Restaurant Dessert Shop Café Light Rail Station Noodle House Automotive Shop Gym Asian Restaurant Bubble Tea Shop Convenience Store
8900 NaN -122.959104 ZN 2 NaN 1 NaN 1 Park Coffee Shop Gym / Fitness Center Bus Station Garden Trail Café Fast Food Restaurant Filipino Restaurant Ethiopian Restaurant
8902 NaN -122.912614 ZN 2 NaN 1 NaN 1 Coffee Shop Sandwich Place Pizza Place Sushi Restaurant Tea Room Italian Restaurant Fast Food Restaurant Chinese Restaurant Burger Joint Breakfast Spot
8903 NaN -122.906175 ZN 2 NaN 1 NaN 1 Coffee Shop Sandwich Place Greek Restaurant Comedy Club Yoga Studio Pizza Place Café Rock Club Pub Cocktail Bar
8904 NaN -122.889393 ZN 2 NaN 1 NaN 1 Coffee Shop Shopping Plaza Sushi Restaurant Gastropub Café Metro Station Gourmet Shop Grocery Store Sandwich Place Bank
8906 NaN -122.874197 ZN 3 NaN 1 NaN 1 Sporting Goods Shop Lounge Light Rail Station Sandwich Place Yoga Studio Ethiopian Restaurant Flower Shop Flea Market Fish Market Fish & Chips Shop
8907 NaN -122.850610 ZN 3 NaN 1 NaN 1 Bus Station Liquor Store Yoga Studio Japanese Restaurant Pharmacy Storage Facility Light Rail Station Sushi Restaurant Sandwich Place Café
8908 NaN -122.847868 ZN 3 NaN 1 NaN 1 Coffee Shop Fast Food Restaurant Pizza Place Sandwich Place Vietnamese Restaurant Grocery Store Bank Sushi Restaurant Asian Restaurant Toy / Game Store
8909 NaN -122.844737 ZN 3 NaN 1 NaN 1 Coffee Shop Restaurant Fast Food Restaurant Shopping Mall Supermarket Sushi Restaurant Greek Restaurant Gym Furniture / Home Store Asian Restaurant
8910 NaN -122.773999 WCE2Z NaN 1 NaN 1 Pizza Place Fast Food Restaurant Bowling Alley Falafel Restaurant Farmers Market Taco Place Gift Shop Coffee Shop Grocery Store Gym / Fitness Center
8911 NaN -122.688379 WCE3Z NaN 1 NaN 1 Pizza Place Sandwich Place Coffee Shop Convenience Store Gym Farmers Market Bistro Salon / Barbershop Chinese Restaurant Pub
8912 NaN -122.666210 WCE3Z NaN 1 NaN 1 Breakfast Spot Train Station Gymnastics Gym Hardware Store Ethiopian Restaurant Fabric Shop Falafel Restaurant Farmers Market Fast Food Restaurant Duty-free Shop
8913 NaN -122.605240 WCE3Z NaN 1 NaN 1 Pub Pier Train Station Yoga Studio Duty-free Shop Flea Market Fish Market Fish & Chips Shop Financial or Legal Service Filipino Restaurant
8914 NaN -122.304898 WCE4Z NaN 1 NaN 1 Train Station Bank Gym Liquor Store Restaurant Coffee Shop Gas Station Farmers Market Falafel Restaurant Yoga Studio
8915 NaN -123.083309 ZN 2 NaN 1 NaN 1 Sushi Restaurant Gastropub Coffee Shop Greek Restaurant Pizza Place Hotel Ice Cream Shop Frozen Yogurt Shop Sandwich Place Restaurant
In [ ]:
cluster_merge.loc[cluster_merge['Cluster Labels'] == 2, cluster_merge.columns[[1] + list(range(5, cluster_merge.shape[1]))]]
In [ ]:
cluster_merge.loc[cluster_merge['Cluster Labels'] == 3, cluster_merge.columns[[1] + list(range(5, cluster_merge.shape[1]))]]
In [ ]:
cluster_merge.loc[cluster_merge['Cluster Labels'] == 4, cluster_merge.columns[[1] + list(range(5, cluster_merge.shape[1]))]]
In [ ]:
cluster_merge.loc[cluster_merge['Cluster Labels'] == 5, cluster_merge.columns[[1] + list(range(5, cluster_merge.shape[1]))]]
In [ ]:
cluster_merge.loc[cluster_merge['Cluster Labels'] == 6, cluster_merge.columns[[1] + list(range(5, cluster_merge.shape[1]))]]
In [ ]: